Let’s explore the data using PCA

library(tidyverse)
library(tictoc)
library(caret)
library(ggfortify)
library(MASS)
library(cluster)
source("./parameters.R")

Open the data set

# Open the bag of words.
# `col_types_df` is not defined in this chunk; presumably it comes from the
# sourced parameters.R -- confirm.
fileName = "bow_tfidf__min_words_100_2grams_1000__sampling_balanced__cor_cut_0.3_from_1408_to_1110_rm0.csv"
df = read_csv(fileName, col_types = col_types_df)
# Drop columns 2, 3 and 5 to 9 (selected by position)
df = df[, -c(2, 3, 5:9)]
# During tests, we can work on a sample.
# `sampled` stays a logical switch; the row counts get their own names so the
# flag is not overwritten and the base function name `max` is not reused.
sampled = TRUE
if (isTRUE(sampled)) {
  # fixed seed so the same rows are drawn on every run
  set.seed(42)
  n_rows = nrow(df)
  # keep one tenth of the rows
  n_sampled = round(n_rows / 10)
  df = df[sample(n_rows, n_sampled), ]
}
# show the data set
df

Splitting the data

PCA is an unsupervised dimensionality-reduction technique, so we can get rid of the train/test split and the labels.

# PCA input: every column of df except the first two
X = df[, -(1:2)]
# show the selected columns
X

Run PCA

Build the PCA

# Fit the principal component analysis on X (princomp defaults)
PCA_X = princomp(x = X)
# Uncomment to print the fitted object:
# PCA_X

Show the resulting scree plot

# Scree plot of the fitted PCA object
screeplot(x = PCA_X)

Visualize the result

Project the data into the new space

# Calling predict() without new data on the fitted PCA; the result is kept
# in X_proj
X_proj = predict(object = PCA_X)

Plot the data on the first two principal axes

# Half-width of the square window kept around the origin when zooming
area = 0.1

# Biplot of the PCA: observations are coloured by the `df_toxic` column of df
# and drawn as text labels instead of point shapes, with the variable
# loadings overlaid. `label = TRUE` replaces the misspelt `abel = TRUE`,
# which was not recognised as the label option.
autoplot(
  PCA_X,
  data = df,
  colour = 'df_toxic',
  label = TRUE,
  shape = FALSE,
  label.size = 3,
  loadings = TRUE,
  loadings.label = TRUE,
  loadings.label.size = 5
) +
  # zoom on the centre of the plot without discarding any data
  coord_cartesian(xlim = c(-area, area), ylim = c(-area, area))

LS0tDQp0aXRsZTogIlBDQSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCkxldCdzIGV4cGxvcmUgdGhlIGRhdGEgdXNpbmcgUENBDQoNCg0KYGBge3J9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkodGljdG9jKQ0KbGlicmFyeShjYXJldCkNCmxpYnJhcnkoZ2dmb3J0aWZ5KQ0KbGlicmFyeShNQVNTKQ0KbGlicmFyeShjbHVzdGVyKQ0Kc291cmNlKCIuL3BhcmFtZXRlcnMuUiIpDQpgYGANCg0KIyBPcGVuIHRoZSBkYXRhIHNldA0KDQpgYGB7cn0NCiMgT3BlbiB0aGUgYmFnIG9mIHdvcmRzDQpmaWxlTmFtZSA9ICJib3dfdGZpZGZfX21pbl93b3Jkc18xMDBfMmdyYW1zXzEwMDBfX3NhbXBsaW5nX2JhbGFuY2VkX19jb3JfY3V0XzAuM19mcm9tXzE0MDhfdG9fMTExMF9ybTAuY3N2Ig0KZGYgPSByZWFkX2NzdihmaWxlTmFtZSwgY29sX3R5cGVzPWNvbF90eXBlc19kZikNCmRmID0gZGZbLC1jKDIsMyw1OjkpXQ0KIyBEdXJpbmcgdGVzdHMsIHdlIGNhbiB3b3JrIG9uIGEgc2FtcGxlDQpzYW1wbGVkID0gVFJVRQ0KaWYgKHNhbXBsZWQgPT0gVFJVRSkgew0KICBzZXQuc2VlZCg0MikNCiAgbWF4ID0gbnJvdyhkZikNCiAgc2FtcGxlZCA9IHJvdW5kKG1heC8xMCkNCiAgZGYgPSBkZltzYW1wbGUobWF4LCBzYW1wbGVkKSwgXQ0KfQ0KIyBzaG93IHRoZSBkYXRhIHNldA0KZGYNCmBgYA0KDQojIFNwbGl0dGluZyB0aGUgZGF0YQ0KDQpQQ0EgaXMgYSBjbHVzdGVyaW5nIHRlY2huaXF1ZSBzbyB3ZSBjYW4gZ2V0IHJpZGUgb2YgdGhlIHRyYWluL3Rlc3Qgc3BsaXQgYW5kIHRoZSBsYWJlbHMuDQoNCmBgYHtyfQ0KWCA9IGRmWywtYygxLDIpXQ0KWA0KYGBgDQoNCiMgUnVuIFBDQQ0KDQpCdWlsZCB0aGUgUENBDQoNCmBgYHtyfQ0KUENBX1ggPSBwcmluY29tcChYKQ0KI1BDQV9YDQpgYGANCg0KU2hvdyB0aGUgcmVzdWx0aW5nIHNjcmVlbnBsb3QNCg0KYGBge3J9DQpzY3JlZXBsb3QoUENBX1gpDQpgYGANCiMgVmlzdWFsaXplIHRoZSByZXN1bHQNCg0KUHJvamVjdCB0aGUgZGF0YSBpbiB0aGUgbmV3IHNwYWNlDQoNCmBgYHtyfQ0KWF9wcm9qID0gcHJlZGljdChQQ0FfWCkNCmBgYA0KDQpQbG90IHRoZSBkYXRhIG9uIHRoZSAyIGZpcnN0IGF4aXMNCg0KYGBge3IgZmlnLndpZHRoPTIwfQ0KYXJlYSA9IDAuMQ0KDQphdXRvcGxvdChQQ0FfWCwgZGF0YSA9IGRmLCBjb2xvdXIgPSAnZGZfdG94aWMnLCBhYmVsID0gVFJVRSwgc2hhcGUgPSBGQUxTRSwgbGFiZWwuc2l6ZSA9IDMsIGxvYWRpbmdzID0gVFJVRSwgbG9hZGluZ3MubGFiZWwgPSBUUlVFLCBsb2FkaW5ncy5sYWJlbC5zaXplID0gNSkgKw0KICBjb29yZF9jYXJ0ZXNpYW4oeGxpbT1jKC1hcmVhLGFyZWEpLCB5bGltPWMoLWFyZWEsYXJlYSkpDQpgYGANCg0KDQo=